packages <- c("CIMseq", "CIMseq.data", "tidyverse", "scSeqTools", "printr")
purrr::walk(packages, library, character.only = TRUE)
rm(packages)
##DATA
load('../data/CIMseqData.rda')
load('../data/sObj.rda')
#rename classes
renameClasses <- function(class) {
case_when(
class == "0" ~ "C.Stem.proximal",
class == "1" ~ "C.Stem.distal",
class == "2" ~ "SI.Stem",
class == "3" ~ "C.Goblet.distal",
class == "4" ~ "SI.Enterocytes",
class == "5" ~ "SI.Goblet",
class == "6" ~ "C.Goblet.proximal",
class == "7" ~ "Enteroendocrine",
class == "8" ~ "Tufft",
class == "9" ~ "SI.Paneth",
class == "10" ~ "Blood",
TRUE ~ "error"
)
}
cOrder <- c(
"C.Stem.proximal", "C.Goblet.proximal", "C.Stem.distal", "C.Goblet.distal",
"SI.Goblet", "SI.Paneth", "SI.Stem", "SI.Enterocytes",
"Enteroendocrine", "Tufft", "Blood"
)
getData(cObjSng, "classification") <- renameClasses(getData(cObjSng, "classification"))
features <- getData(cObjMul, "features")
names(features) <- renameClasses(names(features))
getData(cObjMul, "features") <- features
fractions <- getData(sObj, "fractions")
colnames(fractions) <- renameClasses(colnames(fractions))
sObj@fractions <- fractions
plotUnsupervisedClass(cObjSng, cObjMul)
plotUnsupervisedMarkers(
cObjSng, cObjMul,
c("Lgr5", "Ptprc", "Chga", "Dclk1", "Alpi", "Slc26a3", "Atoh1", "Lyz1"),
pal = RColorBrewer::brewer.pal(8, "Set1")
)
plotUnsupervisedMarkers(
cObjSng, cObjMul, "Hoxb13",
pal = RColorBrewer::brewer.pal(8, "Set1")
)
plotUnsupervisedMarkers(
cObjSng, cObjMul, "Mki67",
pal = RColorBrewer::brewer.pal(8, "Set1")
)
classHeatmap(
data = data.frame(
gene = rownames(getData(cObjMul, "counts"))[getData(cObjMul, "features")],
class = names(getData(cObjMul, "features"))
),
counts.log = getData(cObjSng, "counts.log"),
classes = tibble(
sample = colnames(getData(cObjSng, "counts")),
class = getData(cObjSng, "classification")
)
)
Show top 10 (by fold change) features per class.
violinPlot <- function(cpm, genes, classes, class) {
cpm[genes, ] %>%
t() %>%
matrix_to_tibble("sample") %>%
gather(gene, cpm, -sample) %>%
full_join(tibble(sample = colnames(cpm), class = classes), by = "sample") %>%
ggplot() +
geom_jitter(aes(class, cpm), height = 0, size = 0.1, alpha = 0.25) +
facet_wrap(~gene, scales = "free_y") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90)) +
labs(title = paste0(class, " genes"))
}
getGenes <- function(
cpm, classes, features, class, ntop = 10
) {
fc <- foldChangePerClass(
cpm[features[names(features) == class], ],
tibble(sample = colnames(cpm), class = classes)
)
rownames(fc[order(fc[, class], decreasing = TRUE), ])[1:ntop]
}
cpm <- getData(cObjSng, "counts.cpm")
c <- "C.Stem.distal"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)
c <- "C.Stem.proximal"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)
c <- "C.Goblet.distal"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)
c <- "C.Goblet.proximal"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)
c <- "SI.Stem"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)
c <- "SI.Goblet"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)
c <- "SI.Enterocytes"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)
c <- "SI.Paneth"
g <- getGenes(cpm, getData(cObjSng, "classification"), getData(cObjMul, "features"), c)
violinPlot(cpm, g, getData(cObjSng, "classification"), c)
plotSwarmCircos(sObj, cObjSng, cObjMul, weightCut = 0, classOrder = cOrder)
Only detected duplicates and triplicates.
Only ERCC estimated cell number <= 4.
Weight cutoff = 5.
adj <- adjustFractions(cObjSng, cObjMul, sObj, binary = TRUE)
samples <- rownames(adj)
rs <- rowSums(adj)
keep <- rs == 2 | rs == 3
plotSwarmCircos(
filterSwarm(sObj, keep), cObjSng, cObjMul, weightCut = 5,
classOrder = cOrder, theoretical.max = 4
)
efm <- getEdgesForMultiplet(sObj, cObjSng, cObjMul, theoretical.max = 4) %>%
mutate(conn = map2(from, to, ~tibble(
from = sort(c(.x, .y))[1], to = sort(c(.x, .y))[2]
))) %>%
select(-from, -to) %>%
unnest() %>%
distinct()
# fp <- efm %>%
# mutate(fp = case_when(
# str_detect(from, "^SI") & str_detect(to, "^C") ~ TRUE,
# str_detect(to, "^SI") & str_detect(from, "^C") ~ TRUE,
# str_detect(from, "proximal") & str_detect(to, "distal") ~ TRUE,
# str_detect(to, "proximal") & str_detect(from, "distal") ~ TRUE,
# TRUE ~ FALSE
# )) %>%
# group_by(sample) %>%
# summarize(fp = sum(fp)) %>%
# {setNames(pull(., fp), pull(., sample))}
fp <- efm %>%
inner_join(select(MGA.Meta, sample, sub_tissue)) %>%
mutate(fp = case_when(
(str_detect(from, "^C") | str_detect(to, "^C")) & sub_tissue == "small_intestine" ~ TRUE,
(str_detect(from, "^SI") | str_detect(to, "^SI")) & sub_tissue == "colon" ~ TRUE,
TRUE ~ FALSE
)) %>%
group_by(sample) %>%
summarize(fp = sum(fp)) %>%
{setNames(pull(., fp), pull(., sample))}
## Joining, by = "sample"
Detected 461 false positive connections out of 1354 total connections. Of those multiplets with a detected connection, 284 multiplets have at least one false positive out of 1177 total multiplets. The per multiplet false positive distribution is shown in the plot below.
ggplot() +
geom_bar(aes(fp)) +
theme_bw() +
labs(x = "False positives per multiplet") +
scale_x_continuous(breaks = 0:max(fp))
The type of false positive connections and their amount are shown below.
typeFreq <- efm %>%
mutate(fp = case_when(
str_detect(from, "^SI") & str_detect(to, "^C") ~ TRUE,
str_detect(to, "^SI") & str_detect(from, "^C") ~ TRUE,
str_detect(from, "proximal") & str_detect(to, "distal") ~ TRUE,
str_detect(to, "proximal") & str_detect(from, "distal") ~ TRUE,
TRUE ~ FALSE
)) %>%
filter(fp) %>%
count(from, to) %>%
arrange(desc(n)) %>%
as.data.frame()
typeFreq
| from | to | n |
|---|---|---|
| C.Stem.distal | C.Stem.proximal | 288 |
| C.Goblet.distal | C.Stem.proximal | 223 |
| C.Stem.proximal | SI.Stem | 147 |
| C.Goblet.distal | C.Goblet.proximal | 111 |
| C.Goblet.proximal | C.Stem.distal | 90 |
| C.Stem.proximal | SI.Goblet | 75 |
| C.Stem.distal | SI.Enterocytes | 47 |
| C.Stem.distal | SI.Goblet | 41 |
| C.Goblet.distal | SI.Goblet | 30 |
| C.Goblet.proximal | SI.Goblet | 25 |
| C.Stem.distal | SI.Stem | 24 |
| C.Stem.proximal | SI.Enterocytes | 24 |
| C.Goblet.distal | SI.Stem | 11 |
| C.Goblet.distal | SI.Enterocytes | 9 |
| C.Stem.proximal | SI.Paneth | 7 |
| C.Goblet.proximal | SI.Stem | 5 |
| C.Goblet.distal | SI.Paneth | 1 |
| C.Goblet.proximal | SI.Enterocytes | 1 |
#number of features corresponding to class
#total sum of counts for features corresponding to class
# features <- getData(cObjMul, "features")
# cpm <- getData(cObjSng, "counts.cpm")
# classifications <- getData(cObjSng, "classification")
# rs <- rowSums(cpm)
# table(names(features))
#
# geneDat <- tibble(class = names(features), gene = rownames(getData(cObjSng, "counts"))[features]) %>%
# mutate(all = rs[gene]) %>%
# mutate(classOnly = map2_dbl(class, gene, function(c, g) {
# sum(cpm[g, classifications == c])
# })) %>%
# group_by(class) %>%
# summarize(sum.all = sum(all), sum.classOnly = sum(classOnly))
#
# inner_join(typeFreq, geneDat, by = c("from" = "class")) %>%
# inner_join(geneDat, by = c("to" = "class")) %>%
# mutate(
# sum.all = map2_dbl(sum.all.x, sum.all.y, ~sum(c(.x, .y))),
# sum.classOnly = map2_dbl(sum.classOnly.x, sum.classOnly.y, ~sum(c(.x, .y)))
# ) %>%
# ggplot() +
# geom_point(aes(sum.all, n))
dc <- rowSums(adjustFractions(cObjSng, cObjMul, sObj))
tibble(
sample = names(fp),
`False positives (n)` = fp,
`Deconvolution detected connections (n)` = dc[names(fp)]
) %>%
group_by(`Deconvolution detected connections (n)`) %>%
summarize(`False positives (n)` = sum(`False positives (n)`)) %>%
as.data.frame()
| Deconvolution detected connections (n) | False positives (n) |
|---|---|
| 2 | 140 |
| 3 | 198 |
| 4 | 103 |
| 5 | 20 |
| 6 | 0 |
Correlation between various parameters and the number of false positives is shown below:
mulNames <- names(fp) #note other multiplets have 0 connections
ec <- estimateCells(cObjSng, cObjMul, warn = FALSE) %>%
filter(sample %in% mulNames) %>%
{setNames(pull(., estimatedCellNumber), pull(., sample))}
ec <- ec[mulNames]
dec <- rowSums(adjustFractions(cObjSng, cObjMul, sObj, binary = TRUE))
dec <- dec[mulNames]
c <- getData(sObj, "costs")
c <- c[mulNames]
tc <- colSums(getData(cObjMul, "counts"))
tc <- tc[mulNames]
dg <- apply(getData(cObjMul, "counts"), 2, function(c) length(which(c != 0)))
dg <- dg[mulNames]
feat.do <- getData(cObjMul, "counts.cpm")[features, ] %>%
matrix_to_tibble("gene") %>%
gather(sample, cpm, -gene) %>%
group_by(sample) %>%
summarize(do = length(which(cpm == 0))) %>%
{setNames(pull(., do), pull(., sample))}
feat.do <- feat.do[mulNames]
cormat <- cor(matrix(c(fp, ec, dec, c, tc, dg, feat.do), ncol = 7))
corvec <- cormat[2:nrow(cormat), 1]
names(corvec) <- c(
"ERCC estimated cell number", "Deconvolution estimated cell number",
"Deconvolution cost", "Total counts", "Total detected genes",
"Drop-outs in features"
)
data.frame(cor = corvec)
| cor | |
|---|---|
| ERCC estimated cell number | 0.3647011 |
| Deconvolution estimated cell number | 0.4474336 |
| Deconvolution cost | 0.1633668 |
| Total counts | 0.2323673 |
| Total detected genes | 0.2731302 |
| Drop-outs in features | -0.2272327 |
cpm <- getData(cObjMul, "counts.cpm")
features <- getData(cObjMul, "features")
cpm[features, ] %>%
matrix_to_tibble("gene") %>%
gather(sample, cpm, -gene) %>%
inner_join(tibble(class = names(features), gene = rownames(cpm)[features])) %>%
group_by(sample, class) %>%
summarize(cpm.sum = sum(cpm)) %>%
ungroup() %>%
inner_join(tibble(sample = names(fp), fp = fp)) %>%
ggplot() +
geom_point(aes(cpm.sum, fp)) +
facet_wrap(~class)
#multiplets with high fp look to have low expression of the selected features
Distribution of fractions in connections thought to be true vs connections known to be false.
fractions <- getData(sObj, "fractions")
efm %>%
mutate(fp = case_when(
str_detect(from, "^SI") & str_detect(to, "^C") ~ TRUE,
str_detect(to, "^SI") & str_detect(from, "^C") ~ TRUE,
str_detect(from, "proximal") & str_detect(to, "distal") ~ TRUE,
str_detect(to, "proximal") & str_detect(from, "distal") ~ TRUE,
TRUE ~ FALSE
)) %>%
mutate(fracs = pmap(list(sample, from, to), function(s, f, t) {
fractions[s, c(f, t)]
})) %>%
unnest() %>%
ggplot() +
geom_boxplot(aes(fp, fracs)) +
labs(x = "False positive", y = "Deconvolution fraction") +
theme_bw()
Cluster multiplets and look for an overrepresentaion of false positives.
p.dist <- CIMseq::pearsonsDist(getData(cObjMul, "counts.cpm"), CIMseq::selectTopMax(getData(cObjMul, "counts.cpm"), 2000))
set.seed(234899)
u <- uwot::umap(p.dist)
rownames(u) <- colnames(getData(cObjMul, "counts"))
u %>%
matrix_to_tibble("sample") %>%
inner_join(tibble(sample = names(fp), fp = fp)) %>%
mutate(fp.bool = fp > 0) %>%
ggplot() +
geom_point(aes(V1, V2, colour = fp.bool)) +
theme_bw() +
labs(x = "UMAP1", y = "UMAP2") +
guides(colour = guide_legend(title = "Has false positive?"))
## Joining, by = "sample"
u %>%
matrix_to_tibble("sample") %>%
inner_join(tibble(sample = names(fp), fp = fp)) %>%
ggplot() +
geom_point(aes(V1, V2, colour = fp)) +
theme_bw() +
scale_colour_viridis_c() +
labs(x = "UMAP1", y = "UMAP2") +
guides(colour = guide_legend(title = "False positives (n)"))
## Joining, by = "sample"